{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "67c1e6b1",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "下面的例子将展示词向量标准工具包——gensim提供的词嵌入，并展示词嵌入如何表示词的相似度。\n",
    "<!-- https://nlp.stanford.edu/projects/glove/ -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5c5a740a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pprint\n",
    "\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "glove_path = r\"E:\\浏览器\\glove.6B.100d.txt\"  \n",
    "\n",
    "model = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "01a2e4a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('movie', 0.9055121541023254),\n",
      " ('films', 0.8914433717727661),\n",
      " ('directed', 0.8124362826347351),\n",
      " ('documentary', 0.8075793981552124),\n",
      " ('drama', 0.7929168939590454),\n",
      " ('movies', 0.7889865040779114),\n",
      " ('comedy', 0.7842751145362854),\n",
      " ('starring', 0.7573285102844238),\n",
      " ('cinema', 0.7419455647468567),\n",
      " ('hollywood', 0.7307389974594116)]\n",
      "[('vehicle', 0.8630837798118591),\n",
      " ('truck', 0.8597878813743591),\n",
      " ('cars', 0.837166965007782),\n",
      " ('driver', 0.8185911178588867),\n",
      " ('driving', 0.781263530254364),\n",
      " ('motorcycle', 0.7553156614303589),\n",
      " ('vehicles', 0.7462257146835327),\n",
      " ('parked', 0.74594646692276),\n",
      " ('bus', 0.737270712852478),\n",
      " ('taxi', 0.7155269384384155)]\n"
     ]
    }
   ],
   "source": [
    "# 使用most_similar()找到词表中距离给定词最近（最相似）的n个词\n",
    "pprint.pprint(model.most_similar('film'))\n",
    "pprint.pprint(model.most_similar('car'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8b62f7ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "japanese\n",
      "panda\n",
      "longest\n",
      "terrible\n",
      "queen\n"
     ]
    }
   ],
   "source": [
    "# 利用GloVe展示一个类比的例子\n",
    "def analogy(x1, x2, y1):\n",
    "    # 寻找top-N最相似的词。\n",
    "    result = model.most_similar(positive=[y1, x2], negative=[x1])\n",
    "    return result[0][0]\n",
    "\n",
    "print(analogy('china', 'chinese', 'japan'))\n",
    "print(analogy('australia', 'koala', 'china'))\n",
    "print(analogy('tall', 'tallest', 'long'))\n",
    "print(analogy('good', 'fantastic', 'bad'))\n",
    "print(analogy('man', 'woman', 'king'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c308cee",
   "metadata": {},
   "source": [
    "下面将展示word2vec的代码，包括文本预处理、skipgram算法的实现、以及使用PyTorch进行优化。这里使用《小王子》这本书作为训练语料。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "590fc408",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 安装NLTK，使用如下代码下载punkt组件\n",
    "#import nltk\n",
    "#nltk.download('punkt')\n",
    "\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from collections import defaultdict\n",
    "\n",
    "# 使用类管理数据对象，包括文本读取、文本预处理等\n",
    "class TheLittlePrinceDataset:\n",
    "    def __init__(self, tokenize=True):\n",
    "        # 利用NLTK函数进行分句和分词\n",
    "        text = open('the little prince.txt', 'r', encoding='utf-8').read()\n",
    "        if tokenize:\n",
    "            self.sentences = sent_tokenize(text.lower())\n",
    "            self.tokens = [word_tokenize(sent) for sent in self.sentences]\n",
    "        else:\n",
    "            self.text = text\n",
    "\n",
    "    def build_vocab(self, min_freq=1):\n",
    "        # 统计词频\n",
    "        frequency = defaultdict(int)\n",
    "        for sentence in self.tokens:\n",
    "            for token in sentence:\n",
    "                frequency[token] += 1\n",
    "        self.frequency = frequency\n",
    "\n",
    "        # 加入<unk>处理未登录词，加入<pad>用于对齐变长输入进而加速\n",
    "        self.token2id = {'<unk>': 1, '<pad>': 0}\n",
    "        self.id2token = {1: '<unk>', 0: '<pad>'}\n",
    "        for token, freq in sorted(frequency.items(), key=lambda x: -x[1]):\n",
    "            # 丢弃低频词\n",
    "            if freq > min_freq:\n",
    "                self.token2id[token] = len(self.token2id)\n",
    "                self.id2token[len(self.id2token)] = token\n",
    "            else:\n",
    "                break\n",
    "\n",
    "    def get_word_distribution(self):\n",
    "        distribution = np.zeros(vocab_size)\n",
    "        for token, freq in self.frequency.items():\n",
    "            if token in dataset.token2id:\n",
    "                distribution[dataset.token2id[token]] = freq\n",
    "            else:\n",
    "                # 不在词表中的词按<unk>计算\n",
    "                distribution[1] += freq\n",
    "        distribution /= distribution.sum()\n",
    "        return distribution\n",
    "\n",
    "    # 将分词结果转化为索引表示\n",
    "    def convert_tokens_to_ids(self, drop_single_word=True):\n",
    "        self.token_ids = []\n",
    "        for sentence in self.tokens:\n",
    "            token_ids = [self.token2id.get(token, 1) for token in sentence]\n",
    "            # 忽略只有一个token的序列，无法计算loss\n",
    "            if len(token_ids) == 1 and drop_single_word:\n",
    "                continue\n",
    "            self.token_ids.append(token_ids)\n",
    "        \n",
    "        return self.token_ids\n",
    "\n",
    "dataset = TheLittlePrinceDataset()\n",
    "dataset.build_vocab(min_freq=1)\n",
    "sentences = dataset.convert_tokens_to_ids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "efc882de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(76044, 2) [[  4  16]\n",
      " [  4  19]\n",
      " [ 16   4]\n",
      " ...\n",
      " [130   3]\n",
      " [  3  86]\n",
      " [  3 130]]\n"
     ]
    }
   ],
   "source": [
    "# 遍历所有的中心词-上下文词对\n",
    "window_size = 2\n",
    "data = []\n",
    "\n",
    "for sentence in sentences:\n",
    "    for i in range(len(sentence)):\n",
    "        for j in range(i-window_size, i+window_size+1):\n",
    "            if j == i or j < 0 or j >= len(sentence):\n",
    "                continue\n",
    "            center_word = sentence[i]\n",
    "            context_word = sentence[j]\n",
    "            data.append([center_word, context_word])\n",
    "\n",
    "# 需要提前安装numpy\n",
    "import numpy as np\n",
    "data = np.array(data)\n",
    "print(data.shape, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "30903b3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 需要提前安装PyTorch\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# 实现skipgram算法，使用对比学习计算损失\n",
    "class SkipGramNCE(nn.Module):\n",
    "    def __init__(self, vocab_size, embed_size, distribution,\\\n",
    "                 neg_samples=20):\n",
    "        super(SkipGramNCE, self).__init__()\n",
    "        print(f'vocab_size = {vocab_size}, embed_size = {embed_size}, '+\\\n",
    "              f'neg_samples = {neg_samples}')\n",
    "        self.input_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        self.output_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        distribution = np.power(distribution, 0.75)\n",
    "        distribution /= distribution.sum()\n",
    "        self.distribution = torch.tensor(distribution)\n",
    "        self.neg_samples = neg_samples\n",
    "        \n",
    "    def forward(self, input_ids, labels):\n",
    "        i_embed = self.input_embeddings(input_ids)\n",
    "        o_embed = self.output_embeddings(labels)\n",
    "        batch_size = i_embed.size(0)\n",
    "        n_words = torch.multinomial(self.distribution, batch_size * \\\n",
    "            self.neg_samples, replacement=True).view(batch_size, -1)\n",
    "        n_embed = self.output_embeddings(n_words)\n",
    "        pos_term = F.logsigmoid(torch.sum(i_embed * o_embed, dim=1))\n",
    "        # 负采样，用于对比学习\n",
    "        neg_term = F.logsigmoid(- torch.bmm(n_embed, \\\n",
    "            i_embed.unsqueeze(2)).squeeze())\n",
    "        neg_term = torch.sum(neg_term, dim=1)\n",
    "        loss = - torch.mean(pos_term + neg_term)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1d9da6c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00000000e+00 5.43983724e-02 5.34295679e-02 ... 9.68804495e-05\n",
      " 9.68804495e-05 9.68804495e-05]\n",
      "vocab_size = 1078, embed_size = 128, neg_samples = 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "epoch-99, loss=2.9676: 100%|█| 100/100 [03:58<00:00,  2.39s/\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjMAAAGwCAYAAABcnuQpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAABKMklEQVR4nO3deXiU1f3+8XuyTfZAAiQEwr6LyKYIqKCyKIgLtS6gYm3rghtSxVLan1hbUNoitih+XYq0SrG2SKkrwSKCyL5FdiSQAAmBELJnkpk5vz9ChoQshGQWBt6v65rrYp7nmcnhEDJ3zvmc81iMMUYAAAB+KsDXDQAAAGgMwgwAAPBrhBkAAODXCDMAAMCvEWYAAIBfI8wAAAC/RpgBAAB+LcjXDfA0p9Opo0ePKioqShaLxdfNAQAA9WCMUX5+vhITExUQUPfYy0UfZo4ePaqkpCRfNwMAADRAenq6WrduXec1F32YiYqKklTeGdHR0T5uDQAAqI+8vDwlJSW5PsfrctGHmYqppejoaMIMAAB+pj4lIhQAAwAAv0aYAQAAfo0wAwAA/JpPw8z06dNlsViqPBISElznjTGaPn26EhMTFRYWpqFDh2rHjh0+bDEAALjQ+Hxk5rLLLlNGRobrkZKS4jo3a9YszZ49W3PnztWGDRuUkJCg4cOHKz8/34ctBgAAFxKfh5mgoCAlJCS4Hs2bN5dUPiozZ84cTZs2TWPHjlXPnj21YMECFRUVaeHChT5uNQAAuFD4PMzs27dPiYmJat++ve655x4dOHBAkpSamqrMzEyNGDHCda3VatWQIUO0Zs2aWt/PZrMpLy+vygMAAFy8fBpmBgwYoL/97W/68ssv9fbbbyszM1ODBg1Sdna2MjMzJUnx8fFVXhMfH+86V5OZM2cqJibG9WD3XwAALm4+DTM333yzfvSjH+nyyy/XsGHD9Omnn0qSFixY4Lrm7M1yjDF1bqAzdepU5ebmuh7p6emeaTwAALgg+HyaqbKIiAhdfvnl2rdvn2tV09mjMFlZWdVGayqzWq2u3X7Z9RcAgIvfBRVmbDabdu3apZYtW6p9+/ZKSEhQcnKy63xpaalWrlypQYMG+bCVAADgQuLTezM9++yzGjNmjNq0aaOsrCz97ne/U15eniZMmCCLxaJJkyZpxowZ6ty5szp37qwZM2YoPDxc48aN82WzAQDABcSnYebw4cO69957deLECTVv3lxXX3211q5dq7Zt20qSpkyZouLiYk2cOFE5OTkaMGCAli1bVq87aHpaUaldOUVlCgkMUPMoq6+bAwDAJctijDG+boQn5eXlKSYmRrm5uW6tn5mzfK/mLN+ncQPaaMYdl7vtfQEAwPl9fl9QNTP+JNJaPqhVaLP7uCUAAFzaCDMNFB5SEWYcPm4JAACXNsJMA0VYAyUxMgMAgK8RZhqoYpqpqJQwAwCALxFmGqhimqmAkRkAAHyKMNNAZ0ZmqJkBAMCXCDMNFH66ZoaRGQAAfIsw00CVl2Zf5Fv1AABwQSPMNFB4SPnIjNNINrvTx60BAODSRZhpoIoCYImpJgAAfIkw00CBARaFBZePzhSxcR4AAD5DmGmECCvLswEA8DXCTCNEnl7RxMZ5AAD4DmGmEdg4DwAA3yPMNAIb5wEA4HuEmUZg4zwAAHyPMNMIEZU2zgMAAL5BmGmEiJCKAmCmmQAA8BXCTCOwNBsAAN8jzDRCxOnVTEWEGQAAfIYw0whnRmaYZgIAwFcIM43ApnkAAPgeYaYR2DQPAADfI8w0QgSb5gEA4HOEmUaIOD3NxD4zAAD4DmGmEViaDQCA7xFmGsG1NJtpJgAAfIYw0wgR3JsJAACfI8w0QsVds0vtTpU5nD5uDQAAlybCTCNULM2WpCI2zgMAwCcIM40QEhSgkMDyLixk4zwAAHyCMNNI4SzPBgDApwgzjVSxoqmQFU0AAPgEYaaR2DgPAADfIsw0EhvnAQDgW4SZ
RjqzcR5hBgAAXyDMNNKZjfOomQEAwBcIM43kunM200wAAPgEYaaRXKuZCDMAAPgEYaaRKkZmWJoNAIBvEGYaKSKEpdkAAPgSYaaRGJkBAMC3CDONxKZ5AAD4FmGmkdg0DwAA3yLMNBKb5gEA4FuEmUZy1cywaR4AAD5BmGkkamYAAPAtwkwjsWkeAAC+RZhpJNftDMoccjqNj1sDAMClhzDTSBXTTMZIxWXUzQAA4G2EmUYKCw6UxVL+Z6aaAADwPsJMI1ksljN1M+wCDACA1xFm3IAVTQAA+A5hxg1Y0QQAgO8QZtzgzM0mCTMAAHgbYcYNzkwzUTMDAIC3EWbcgGkmAAB8hzDjBmemmRiZAQDA2wgzbsBqJgAAfIcw4wZMMwEA4DuEGTcIZzUTAAA+c8GEmZkzZ8pisWjSpEmuY8YYTZ8+XYmJiQoLC9PQoUO1Y8cO3zWyFpGsZgIAwGcuiDCzYcMGvfXWW+rVq1eV47NmzdLs2bM1d+5cbdiwQQkJCRo+fLjy8/N91NKauQqAmWYCAMDrfB5mCgoKNH78eL399ttq2rSp67gxRnPmzNG0adM0duxY9ezZUwsWLFBRUZEWLlzowxZXd+beTIQZAAC8zedh5vHHH9fo0aM1bNiwKsdTU1OVmZmpESNGuI5ZrVYNGTJEa9asqfX9bDab8vLyqjw87czIDNNMAAB4W5Avv/iiRYu0efNmbdiwodq5zMxMSVJ8fHyV4/Hx8Tp06FCt7zlz5ky9+OKL7m3oOUSEsDQbAABf8dnITHp6up5++mm9//77Cg0NrfU6i8VS5bkxptqxyqZOnarc3FzXIz093W1trk3FyEwRm+YBAOB1PhuZ2bRpk7KystSvXz/XMYfDoW+++UZz587Vnj17JJWP0LRs2dJ1TVZWVrXRmsqsVqusVqvnGl6Dik3zChiZAQDA63w2MnPjjTcqJSVFW7dudT369++v8ePHa+vWrerQoYMSEhKUnJzsek1paalWrlypQYMG+arZNaq8mskY4+PWAABwafHZyExUVJR69uxZ5VhERITi4uJcxydNmqQZM2aoc+fO6ty5s2bMmKHw8HCNGzfOF02uVfjp1Ux2p1GpwylrUKCPWwQAwKXDpwXA5zJlyhQVFxdr4sSJysnJ0YABA7Rs2TJFRUX5umlVVBQAS+UrmggzAAB4j8Vc5PMieXl5iomJUW5urqKjoz32dbr95nOVlDm1asr1SooN99jXAQDgUnA+n98+32fmYsHGeQAA+AZhxk3YOA8AAN8gzLhJOBvnAQDgE4QZN4l0bZxHmAEAwJsIM24SfjrMFDDNBACAVxFm3CTSyjQTAAC+QJhxk3BWMwEA4BOEGTeJrHRLAwAA4D2EGTeJcE0zUTMDAIA3EWbcxDXNxMgMAABeRZhxkzNLsxmZAQDAmwgzblKxaV4BIzMAAHgVYcZN2DQPAADfIMy4CZvmAQDgG4QZN2HTPAAAfIMw4yYVq5mYZgIAwLsIM24S6ZpmIswAAOBNhBk3iTgdZkrKnHI4jY9bAwDApYMw4yYVS7Ml7s8EAIA3EWbcxBoUoKAAiySpiBVNAAB4DWHGTSwWCxvnAQDgA4QZNwo7HWZKyhiZAQDAWwgzbmQNKg8zpQ6nj1sCAMClgzDjRiFB5d1pKyPMAADgLYQZN7JWhBk700wAAHgLYcaNzoQZRmYAAPAWwowbVdTMEGYAAPAewowbWYMramaYZgIAwFsIM25UMc3EaiYAALyHMONGIRXTTKxmAgDAawgzbkQBMAAA3keYcSOWZgMA4H2EGTdiNRMAAN5HmHGjM6uZCDMAAHgLYcaNmGYCAMD7CDNu5LrRJNNMAAB4DWHGjUJYzQQAgNcRZtyIaSYAALyPMONG7DMDAID3EWbcyBrMDsAAAHgbYcaNmGYCAMD7CDNuxI0mAQDwPsKMG7lWMzHNBACA1xBm3IjbGQAA4H2EGTeiZgYAAO8jzLhRaDBLswEA8DbCjBu5ppmomQEA
wGsIM27EaiYAALyPMONGFauZHE4jO4EGAACvIMy4UcU0k0TdDAAA3kKYcaOKkRmJMAMAgLcQZtwoMMCi4ECLJJZnAwDgLYQZN2NFEwAA3kWYcbMzG+cRZgAA8AbCjJtV1M2UEmYAAPAKwoybcUsDAAC8izDjZtxsEgAA7yLMuJk1mJEZAAC8iTDjZq5pJlYzAQDgFYQZN2OaCQAA7yLMuBmrmQAA8C6fhpl58+apV69eio6OVnR0tAYOHKjPP//cdd4Yo+nTpysxMVFhYWEaOnSoduzY4cMWnxurmQAA8C6fhpnWrVvr5Zdf1saNG7Vx40bdcMMNuu2221yBZdasWZo9e7bmzp2rDRs2KCEhQcOHD1d+fr4vm10nNs0DAMC7fBpmxowZo1GjRqlLly7q0qWLfv/73ysyMlJr166VMUZz5szRtGnTNHbsWPXs2VMLFixQUVGRFi5c6Mtm14maGQAAvOuCqZlxOBxatGiRCgsLNXDgQKWmpiozM1MjRoxwXWO1WjVkyBCtWbOm1vex2WzKy8ur8vAm19LsMqaZAADwBp+HmZSUFEVGRspqterRRx/Vxx9/rB49eigzM1OSFB8fX+X6+Ph417mazJw5UzExMa5HUlKSR9t/NqaZAADwLp+Hma5du2rr1q1au3atHnvsMU2YMEE7d+50nbdYLFWuN8ZUO1bZ1KlTlZub63qkp6d7rO01CSHMAADgVUG+bkBISIg6deokSerfv782bNig1157Tc8//7wkKTMzUy1btnRdn5WVVW20pjKr1Sqr1erZRteBmhkAALzL5yMzZzPGyGazqX379kpISFBycrLrXGlpqVauXKlBgwb5sIV1Y2k2AADe5dORmV/96le6+eablZSUpPz8fC1atEhff/21vvjiC1ksFk2aNEkzZsxQ586d1blzZ82YMUPh4eEaN26cL5tdJ2pmAADwLp+GmWPHjun+++9XRkaGYmJi1KtXL33xxRcaPny4JGnKlCkqLi7WxIkTlZOTowEDBmjZsmWKioryZbPrZA0+Pc3EvZkAAPAKn4aZd999t87zFotF06dP1/Tp073TIDdgmgkAAO+64Gpm/F1FATD3ZgIAwDsIM27G0mwAALyLMONmFAADAOBdhBk3o2YGAADvIsy4GauZAADwLsKMmzHNBACAdxFm3IxpJgAAvIsw42YVq5lYmg0AgHcQZtys8o0mjTE+bg0AABc/woybWYPPdGmpg9EZAAA8jTDjZhU1MxJFwAAAeANhxs1CAiuFGZZnAwDgcYQZN7NYLKxoAgDAiwgzHsCKJgAAvIcw4wGVVzQBAADPalCYWbBggT799FPX8ylTpqhJkyYaNGiQDh065LbG+St2AQYAwHsaFGZmzJihsLAwSdJ3332nuXPnatasWWrWrJmeeeYZtzbQH1Usz7aVUTMDAICnBTXkRenp6erUqZMkacmSJbrzzjv18MMPa/DgwRo6dKg72+eXmGYCAMB7GjQyExkZqezsbEnSsmXLNGzYMElSaGioiouL3dc6P8U0EwAA3tOgkZnhw4frZz/7mfr06aO9e/dq9OjRkqQdO3aoXbt27myfX2I1EwAA3tOgkZnXX39dAwcO1PHjx/Xvf/9bcXFxkqRNmzbp3nvvdWsD/RH7zAAA4D0NGplp0qSJ5s6dW+34iy++2OgGXQyomQEAwHsaNDLzxRdfaPXq1a7nr7/+unr37q1x48YpJyfHbY3zV6xmAgDAexoUZp577jnl5eVJklJSUvSLX/xCo0aN0oEDBzR58mS3NtAfUQAMAID3NGiaKTU1VT169JAk/fvf/9Ytt9yiGTNmaPPmzRo1apRbG+iPmGYCAMB7GjQyExISoqKiIknS8uXLNWLECElSbGysa8TmUkYBMAAA3tOgkZlrrrlGkydP1uDBg7V+/Xp9+OGHkqS9e/eqdevWbm2gP7KyNBsAAK9p0MjM3LlzFRQUpH/961+aN2+eWrVqJUn6/PPPddNNN7m1gf6I
mhkAALynQSMzbdq00SeffFLt+KuvvtroBl0MrMGna2bKCDMAAHhag8KMJDkcDi1ZskS7du2SxWJR9+7dddtttykwMNCd7fNL1MwAAOA9DQoz+/fv16hRo3TkyBF17dpVxhjt3btXSUlJ+vTTT9WxY0d3t9OvMM0EAID3NKhm5qmnnlLHjh2Vnp6uzZs3a8uWLUpLS1P79u311FNPubuNfoel2QAAeE+DRmZWrlyptWvXKjY21nUsLi5OL7/8sgYPHuy2xvkrbjQJAID3NGhkxmq1Kj8/v9rxgoIChYSENLpR/o6aGQAAvKdBYeaWW27Rww8/rHXr1skYI2OM1q5dq0cffVS33nqru9vod1z3ZmJkBgAAj2tQmPnzn/+sjh07auDAgQoNDVVoaKgGDRqkTp06ac6cOW5uov9x1cywNBsAAI9rUM1MkyZN9J///Ef79+/Xrl27ZIxRjx491KlTJ3e3zy8xzQQAgPfUO8yc627YX3/9tevPs2fPbnCDLgasZgIAwHvqHWa2bNlSr+ssFkuDG3OxYDUTAADeU+8ws2LFCk+246LCpnkAAHhPgwqAUbczq5momQEAwNMIMx5QUTNT5jByOI2PWwMAwMWNMOMBFdNMEnUzAAB4GmHGAyqHGaaaAADwLMKMBwQFBijg9KIuioABAPAswoyHVNTNMM0EAIBnEWY8hBVNAAB4B2HGQyrqZkq4PxMAAB5FmPEQbmkAAIB3EGY8hJtNAgDgHYQZDwnhlgYAAHgFYcZDrNxsEgAAryDMeAg1MwAAeAdhxkNcS7PLqJkBAMCTCDMeYqVmBgAAryDMeAjTTAAAeAdhxkNCWJoNAIBXEGY8hNVMAAB4B2HGQ5hmAgDAOwgzHnJmNRNhBgAATyLMeAi3MwAAwDt8GmZmzpypK6+8UlFRUWrRooVuv/127dmzp8o1xhhNnz5diYmJCgsL09ChQ7Vjxw4ftbj+mGYCAMA7fBpmVq5cqccff1xr165VcnKy7Ha7RowYocLCQtc1s2bN0uzZszV37lxt2LBBCQkJGj58uPLz833Y8nPj3kwAAHhHkC+/+BdffFHl+fz589WiRQtt2rRJ1113nYwxmjNnjqZNm6axY8dKkhYsWKD4+HgtXLhQjzzySLX3tNlsstlsrud5eXme/UvU4sxqJqaZAADwpAuqZiY3N1eSFBsbK0lKTU1VZmamRowY4brGarVqyJAhWrNmTY3vMXPmTMXExLgeSUlJnm94DdgBGAAA77hgwowxRpMnT9Y111yjnj17SpIyMzMlSfHx8VWujY+Pd50729SpU5Wbm+t6pKene7bhtbAGn66ZYTUTAAAe5dNppsqeeOIJbd++XatXr652zmKxVHlujKl2rILVapXVavVIG88Hq5kAAPCOC2Jk5sknn9TSpUu1YsUKtW7d2nU8ISFBkqqNwmRlZVUbrbnQMM0EAIB3+DTMGGP0xBNPaPHixfrf//6n9u3bVznfvn17JSQkKDk52XWstLRUK1eu1KBBg7zd3PPC0mwAALzDp9NMjz/+uBYuXKj//Oc/ioqKco3AxMTEKCwsTBaLRZMmTdKMGTPUuXNnde7cWTNmzFB4eLjGjRvny6afEzeaBADAO3waZubNmydJGjp0aJXj8+fP14MPPihJmjJlioqLizVx4kTl5ORowIABWrZsmaKiorzc2vPDjSYBAPAOn4YZY8w5r7FYLJo+fbqmT5/u+Qa5UWgwNTMAAHjDBVEAfDFy1cywNBsAAI8izHhI5aXZ9RmBAgAADUOY8ZCKkRmnkexOwgwAAJ5CmPGQitVMEnUzAAB4EmHGQyqHGVY0AQDgOYQZDwkMsCg4sPyWC+w1AwCA5xBmPIgVTQAAeB5hxoO4PxMAAJ5HmPEg7pwNAIDnEWY8KISRGQAAPI4w40EVNTOsZgIAwHMIMx5kDWaaCQAATyPMeJCrZobVTAAAeAxhxoNcS7OZZgIAwGMIMx7EaiYAADyPMONBrGYCAMDzCDMe
RM0MAACeR5jxINfSbAdhBgAATyHMeJBraXYZNTMAAHgKYcaDuDcTAACeR5jxIJZmAwDgeYQZDwphaTYAAB5HmPEgVjMBAOB5hBkPcoUZVjMBAOAxhBkPsgafrplhZAYAAI8hzHhQ6Oml2UWldh+3BACAixdhxoOaR4ZKkrLybT5uCQAAFy/CjAclxFglScdyS3zcEgAALl6EGQ+Kjy4fmcm32VVoY6oJAABPIMx4UFRosCKtQZKkzDxGZwAA8ATCjIfFRzPVBACAJxFmPCwhpnyqiZEZAAA8gzDjYRV1MxmMzAAA4BGEGQ9LOB1mjjEyAwCARxBmPMw1zcTIDAAAHkGY8bB4RmYAAPAowoyHtaQAGAAAjyLMeFhFzczxfJvs3D0bAAC3I8x4WFykVYEBFjmNdKKg1NfNAQDgokOY8bDAAItaRJVvnMdUEwAA7keY8YKKIuDM3GIftwQAgIsPYcYLEqJZng0AgKcQZrzgzC0NbD5uCQAAFx/CjBdUhBn2mgEAwP0IM17ANBMAAJ5DmPECdgEGAMBzCDNekFBpF2BjjI9bAwDAxYUw4wUV00xFpQ7l2+w+bg0AABcXwowXhIUEKjo0SBJ1MwAAuBthxktcU031CDNHTxXrH+vT5HAyJQUAwLkE+boBl4qEmDDtPVZQr1saTPs4RSv2HFdocIDu6NPaC60DAMB/MTLjJQnR5fdnOnaOkZlSu1NrD5yUJO3JLPB4uwAA8HeEGS9x7TVzjpGZlCOnVFzmkCQdyi70eLsAAPB3hBkvia/nLsAVozKSdCi7yKNtAgDgYkCY8ZL6jsysPZDt+nPaySL2pQEA4BwIM14S77qlQe03myy1O7XxYI7reYHNruzCUo+3DQAAf0aY8ZKKpdnZhTaV2p01XlNRLxMbEaKWp69nqgkAgLoRZrwkNjxEwYEWGSNl5dc81VRRLzOgfazaxoVLktJOUgQMAEBdCDNeEhBgOecNJyvqZa7uEKd2cRGSpIMnGJkBAKAuhBkvSqijbqZyvczVHeLUxjUyQ5gBAKAuhBkvio+pfUVT5XqZzi0i1Tb29MgMe80AAFAnn4aZb775RmPGjFFiYqIsFouWLFlS5bwxRtOnT1diYqLCwsI0dOhQ7dixwzeNdYOEOqaZKtfLBARYztTMUAAMAECdfBpmCgsLdcUVV2ju3Lk1np81a5Zmz56tuXPnasOGDUpISNDw4cOVn5/v5Za6x5lppprCzJl6GUmuMJNdWKr8kjIvtRAAAP/j0xtN3nzzzbr55ptrPGeM0Zw5czRt2jSNHTtWkrRgwQLFx8dr4cKFeuSRR2p8nc1mk812piYlLy/P/Q1voNqmmc6ul5GkqNBgxUaE6GRhqQ5lF6lnqxjvNhYAAD9xwdbMpKamKjMzUyNGjHAds1qtGjJkiNasWVPr62bOnKmYmBjXIykpyRvNrZfappnOrpep0JYiYAAAzumCDTOZmZmSpPj4+CrH4+PjXedqMnXqVOXm5roe6enpHm3n+ajYCC8jt6TKbQrOrpep0Da2PMywcR4AALXz6TRTfVgslirPjTHVjlVmtVpltVo93awGaRFd3q5Su1OnisrUNCJE0pl6mYEd46pc3/b0XjPcPRsAgNpdsGEmISFBUvkITcuWLV3Hs7Kyqo3W+AtrUKCrDuYv/9uv1k3DFGkNqlYvU6FimomRGQAAanfBhpn27dsrISFBycnJ6tOnjySptLRUK1eu1CuvvOLj1jVcm9hwnSws1V+/Ta1y/Ox6GYmaGQAA6sOnYaagoED79+93PU9NTdXWrVsVGxurNm3aaNKkSZoxY4Y6d+6szp07a8aMGQoPD9e4ceN82OrG+d3tPfXxliMqKLGrsNSuQptdxWUO/bhfUrXps4pppqO5xbLZHbIGBfqiyQAAXNB8GmY2btyo66+/3vV88uTJkqQJEybovffe05QpU1RcXKyJEycqJydHAwYM0LJlyxQVFeWrJjdaz1Yx9V5mHRcRooiQQBWW
OpR+slidzhq5AQAAksVUXlZzEcrLy1NMTIxyc3MVHR3t6+act5tfW6VdGXn664P9dUM3/6wVAgDgfJ3P5/cFuzQb5dqdrpvh7tkAANSMMHOB4+7ZAADUjTBzgau4ezZ7zQAAUDPCzAWuHXvNAABQJ8LMBa5imik9p0gOZ+212sYYPfPhVj31jy1y1nEdAAAXG8LMBa5lTJhCAgNU5jDKyC2u9brDOcX6eMsRLd12VOk5jOIAAC4dhJkLXGCARa1jwyTVPdW0MyPP9ee9xwo83i4AAC4UhBk/UJ+7Z++qEmbyPd4mAAAuFIQZP+C6e/bJ2lc07TxKmAEAXJoIM37AdffsOjbO25XJNBMA4NJEmPEDrjBTy8Z5+SVlSj95pjj4h+MFsjucXmkbAAC+RpjxA+1OTzOlnihQWQ0hZXdm+bRSfLRVYcGBKrU7aw0+AABcbAgzfqBdXISahAerpMyplCO51c5X1Mv0TIxx3Vl7H3UzAIBLBGHGDwQEWDSgfawkae2B7GrnK1YydW8Zrc7x5WFmTyZ1MwCASwNhxk9c3SFOkrT2wMlq5yqHma7xUZKkvVmMzAAALg1Bvm4A6qcizGw8eFJlDqeCA8tzqN3hdNXM9EiMVnhIoCSmmQAAlw5GZvxE1/goNQ0PVlGpQ9sPn6mbOZhdKJvdqfCQQLWNDVeXhPKRmQPHC1VqZ0UTAODiR5jxE+V1MxVTTWfqZnZmlI/AdE2IUkCARYkxoYq0BsnuNDqYXfsmewAAXCwIM37k6g7Vi4ArVjL1aBktSbJYLK4VTewEDAC4FBBm/MjVHSvqZnJc+81ULv6t4CoCziTMAAAufoQZP9KlRXndTHHZmbqZijDTI/FMmKlYns1tDQAAlwLCjB85u27mRIFNWfk2WSxSt9OFv1J5/YzENBMA4NJAmPEzletmKkZl2sVFKDzkzCr7LqenmQ5mF6qkzOH9RgIA4EWEGT9TuW6mYqqpR6V6GUlqEWVVdGiQnKZ8iTYAABczwoyfqVw389HGdElS95ZRVa6xWCwXxFRTWnaRXvlit47llfisDQCAix9hxs9Urps5mF1+Z+zuZ43MSFLneN+HmVe+2K15X/+gn8zfoOJSprsAAJ5BmPFDFXUzFSqvZKrgWp7thhVNTqfRP9anKeVw9Tt216bU7tTKvcclSTsz8jR18XYZYxrdFgAAzkaY8UMVdTOS1CQ8WAnRodWuObM8u/EjMx9tStfUxSm66/++06ZDOfV6zbrUbBXY7IqyBikwwKIlW4/qr98ebHRbAAA4G2HGD1XUzUhS94RoWSyW6tecHplJzylSUam9wV+rpMyhV5P3SZKKyxx66L0N9QpIX+3KkiSNurylpo3qLkma8dkurfnhRIPbAgBATQgzfqhy3UxNU0yS1CzSqriIEBkj7c+qfarJGKP1qSeVX1JW4/n31hxUZl6JWjUJU++kJsotLtMD767X4ZyiOt9z+a5jkqQbu7fQTwa30x19WsnhNHpi4RYdOVVc378qAADnRJjxU88M76JberXUQ9e0r/Wa+uwE/Pe1h3TX/32nH7/5nQpsVUdwcovK9MaK/a6vN//BK9WpRaQy80r0wLvrlV1gq/E99x4r0OGcYlmDAnRN52ayWCyaccfluiwxWicLS/XI3zeq0Fb7aNHmtBx9uj1DDic1NgCAcyPM+KmuCVGaO66vWjUJq/2a01NNtdW5ZOaWaNYXeyRJuzPz9eTCzbKfvueTJM1b+YPySuzqGh+lO/q0UtOIEP39p1epVZMwHThRqAfnb6gxlFSMygzu1My1mV9YSKD+7/5+io0I0fdH8vTo+5tUandWe+0X32fqrje/0+MLN+vON9dod2ZePXsEAHCpIsxcxIZ2bSFJWrQhTd+cXllU2Yv/3aECm11d4iMVGhygFXuO68X/7pQxRpm5JZr/baok6bmRXRUYUF6X0zImTH/76VWKjQhRypFc/fmrfdXet/IUU2Wtm4br3Qn9FRYcqFX7TmjK
v7bJWWn0ZdmOTD2xcLPsTqMAi7Ql7ZRu+fNq/eHL3exkDACoFWHmInZ9txa696o2Mkaa9OFWZeae2bxu+c5j+vz7TAUGWPTaPX005+7esljKp53++u1BvfbVXtnsTvVv27RaKOnYPFJ//HEvSdL8bw8qLftM/cyJApu2pp+SJN3YLb5am/q0aap59/VV0OkVTr//bJeMMUreeUyPnw4yt16RqFXP36CRl8XL7jR6fcUPumnON1qxO4vl3QCAaggzF7kXxvRQj5bltSpP/qN8Gqmo1K4Xlu6QJP3s2vbq3jJaN/Vsqak3d5Mk/e7TnfpwQ/nuwr+8uVuNq6Wu79pC13ZuplKHUy9/sct1/H+7s2SM1LNVtBJiqi8Zl8pHjP5wOgy9uzpVv/jnNk38YJPKHEa39Gqp2XddoVZNwvR/9/fXm/f1U3y0VQezi/ST9zbo7v9bq40HT7q1jwAA/o0wc5ELDQ7UG+P7KtIapA0Hc/THZXs1Z/k+HTlVrNZNw/T0jZ1d1/782g6ukRynkYZ1b6H+7WJrfF+LxaJpo7srwCJ9lpKp9anlAeOriimmGkZlKrujT2v9enT5ku3FW46ozGE0+vKWmnN3bwUFnvm2vKlngpInD9HD13VQSFCA1h88qTvf/E4/W7CBehoAgCTCzCWhXbMIvfKj8pGQN1f+oHdWHZAkvXRbzyp327ZYLPrtbZdpeI94RYUG6ZenR2pq0y0hWndf2UZS+WhOSZlDq/aV7yMzrHvdYUaSfnZtBz02tKMkaXSvlppzT9UgUyE6NFi/GtVdK58bqnuvSlJggEXLd2Xplj+v1oo9WfXoAc9LP1mkn8xfry++z/R1UwDgkmMxF3kRQl5enmJiYpSbm6vo6Jr3ZLlUvPCf77Xgu0OSpNGXt9Tr4/vWeJ0xRk4jV9FvXY7n23T9H79Wgc2u23snasnWo4qPtmrt1BtrnJ6qSVZ+iZpHWut9/YHjBXrpk51asee4oqxB+vjxQerUIurcL/QQY4zufXut1h44qZiwYH3z3PWKOb2pIRrP7nDKZncqwhp07ovhN5xOo5QjueqRGK3gGn6JAc7n85vvoEvIr0Z319UdYpUYE6r/N6ZHrddZLJZ6BRlJah5l1ePXd5IkLdl6VJJ0Q7f4egcTSWoRFXpe13doHqn/u7+/rmoXq3ybXT9dsFE5haVVrnE4jf66OlU//9tGHTxRWOt7bTh4Uk8v2qKdRxs+ZfXPjelae6B8mi23uExvfL2/we/lTvuzCrTBz+uLSu1OjXtnna54cZmmLk5RRq5/bLhojFHK4VzlFtW8GaU72R1Ofbv/hNYeyPabAnmH0+gXH23Tba9/q4kfbPbK10w/WaTUE4V+00eN8c3e43pz5Q/KLfb899+FgpGZS4wx5ryCQ32UlDk0bPZKHc4p/6B5d0J/3ViPaabGyi6w6da53+rIqWIN6hinBQ9dpeDAAKVlF+nZj7Zp/ekP8nZx4Vo8cbBiI0KqvH7H0Vzd9eZ3Kix1qEl4sP7x86trvAN5XbLySzTsTyuVV2LXsO4ttHxXlkKCAvS/XwxR66bhbvu7nq+Uw7m68801stmdmvWjXrrryiS3vv+aH07owPFClTmcpx9GYcGB+lHf1m4dlfr9pzv19qpU1/OQoADdN6CtJl7fUc0irZKkolK7jufbZEz5lGp9GGO042ievtl3XNGhwRrbt1WVKdfGSD9ZpBeW7tD/dmepWWSIXvlRL7f/f3A6jTal5Wjp1qP6LCVD2afDfL+2TfWrUd3Ur23NtW4XAofT6NmPtunjLUdcx955oL+G9fDMzwxjjN5fe0i//WSnyhxGSbFhGtKluYZ0aaFBHeM8MuKXX1Imh9OoSXjIuS92s/1Z+Rr12mqVOpyKiwjR8zd10539Wiugnr+gXkjO5/ObMAO3+GT7UT2xcIvCggO15f8NV2hwoFe+7u7MPP3ojTUqLHXo/qvb6rLEaL30yU4VljoUERKoyNAgHcuzqX/bpnr/
ZwNc7Tp6qlh3vPGtjuXZFBIY4PqPv+jhq9U5vuqU1YaDJ7V482HdekUrDax0k09JevyDzfo0JUOXt4rRxxMH6f531+u7A9m6o08rvXp373r/PcocTu3JzNeW9FP6/nCu+rZt4qpHOl9Z+SW6be63yji9FD8wwKJ3J/R37TtUwRij99elaU9mnp4b2U0xYecOIWUOp176ZKf+dnq68mytmoRp3n191at1kwa1vbKvdh3TTxdslCRNuamrvt5z3FVoHh4SqBZRVh3Pt6mw9MweRFe1i9Vj13fU0C7Nq4X24lKH1qVm66tdWfpq1zEdrbRVQVxEiH52bQfdP7CtIhv44VbmcOqvq1M1Z/k+FZ+1L9K9V7XRr0d3b/QHZ0mZQx+sS9NfV6dWuS1IbESIiksdrq97c88ETbmpm1o3DVNmbokyckuUkVssi8Wiazs1U9MI73/ISqdHZP65VUu2HlVQgEVXd4jT6v0nlBQbpuRnhtTr50ap3anjBTa1jA495wd0SZlDv/o4RYs3lwenAEv54oYKIUEBmvWjXrq9T6sG/52KSu16f+0h7Tyap0Mni5SWXaTswlJZLNIdfVpp8vAu5/zFxu5wandmvr4/kqtOLSJrXXhxLg6n0Z1vrtGWtFOun2uSdEVSE71462XqndSkQe8rlff79sOntPFQjqJCg3TTZQmKO/0LhacQZiohzHiHMUb/WJ+uVk3Lf+vxpmU7MvXI+5tU+Tv5qnax+tNdV6ikzKGx89Yov8SuW3q11J/v6aOCUrvuevM77c7MV5f4SL074Uo9+v4m7Tiap+ZRVn348NXq0DxS6SeL9PIXu/Xp9gzX+44f0EZTR3VXpDVIy3ce08/+tlGBARb95/HB6tkqRimHczVm7mpJ0idPXqOerWJqbLPDabTt8Cmt2J2lNT9k6/sjubKdtSPy3HF9dEuvxPPqC5vdoXvfWqvNaafUoXmEeibGaOm2owoPCdQ/Hxnoas+polI9+9E2LT99Q9DeSU30959epajQ2gNNTmGpHl+4WWt+yJYk3dCthSKsQQoOsCg4MEDfHchW2skihQQG6Ddjeui+AW2qBIq8kjJ9fzhXgQEWRViDTj8CFRseUq3wOyO3WDe/tkqnisr0k8Ht9MKYy2SM0er9J/THL/do2+HcKteHBQeqzOGU/fQnVfeW0XpsaEc1iwzR2h+ytfbASW1NP+X64V7xmsGd4rT3WIHSTpbvldQkPFg/Hdxe469uW20krzZ2h1Or95/Qy5/v1u7M8puwDmgfqxfGXKbFmw/rndXlI0vt4sL1yo96qVOLSIUEBZQ/AgPqNVJaanfqw43pmvu/fTqWV34bkUhrkEZelqBbeydqcMc4ZReWavayvfpoU7qcpvyD20g6+yd8YIBFV7WL1YjL4jXisoQ6dxGvr6z8EgUHBNQZkuwOp37x0Tb953SQmTuur67t3Ew3/mmlMvNK9MywLnp6WOdaX19os+sf69P09qoDOpZnU6Q1SD1bRatX6ya6vFWMOsdHqnXTcFcYTT9ZpEf+vkk7M/IUYJGm3txd4wa00Xc/ZGvl3uP6em+W0k8WKyQwQB8+crX6tGl63n/v74/k6qlFW3TgeO1T2SGBAbp/YFs9fn0nxUaEqNTuVOqJQu05lq+dR/O0JS1H2w/nVgnAD1/XQVNGdq1xQURd3ll1QL/7dJeirEH67Olr9cX3mXrtq32uW9XER1sVEhSg4MDy771mkVaNG9BGIy9LqLG0YN+xfH3+fabWHsjW5rQclZSd+f8TGGDR4E7NdOsViRpxWbyi6/jZ0VCEmUoIM5eG11fs1x++3KOQwAA9N7KrHrqmves/55r9J/TAX9fL7jR65LoO2nE0T6v3n1CLKKs+fnywWjUJU05hqe59e612Z+YrITpUY65oqQXfHVKp3SmLpTwcrTs9KtCqSZh+c0sPvfjfHcrILdEjQzpo6s3dXW15etEW/WfrUQ3qGKcPfjbA9WFlszv01a4sLd95TF/v
Pa6TZ9X5RIcG6YqkJgoJDNBXu7MUFhyofz02UJcl1hyIzmaM0ZR/bddHmw4rOjRISx4frNZNw/XQexu0ev8JNYu06uOJg5RdWKrHP9isI6fKf5CHBgcor8Su/m2basFDV9U4erDvWL5+9reNOpRdpIiQQL16d2+NuCyhyjW5xWV67qNtWrazfHn+rVckasKgtlqzP1vf7DuuzWmnarzfVkxYsMYPaKMHB7VTi+hQ2R1O3fv2Wm04mKPLW8XoX48NlDXozG/sxhhtST8lu8OoeZRVzaOsirQGKTO3RO+sOqCF69NUVFrzjtEtY0J1fbcWGta9hQZ1bKbQ4EDZHU79Z+tRzV2xX6mn66uCAy0a3iNed/VP0rWdm1f7Qe9wGm04eFL/3XZUX3yf6ZrmaRIerGmjuuvOfq1d/+5r9p/QLz7a5hopO1tIYICswQGyBgXKGlT+79E0PERNI0IUGx6i6LAgfZaS6RqJSYwJ1ZM3dtYdfVrVOJKxJzNfMz/fpa/3HHe9f0JMqFrGhCq3uMwVuCo0DQ9Ws8jyfmwWaVW7ZhGaMLBtvX/r/ueGdP16yfcKtwbqX48OrLEY3+E0mvzPrVWCzE09y79//rvtqJ78xxZZgwK0fPIQJcVWHcU4VVSqBWsOaf6aVJ2qRw1Sk/BgtW4apvSTxcotLlNcRIj+Mq6PBnVsVuU6p9Po0fc3adnOY4qPtuq/T1yjFtFV98YqtTu14eBJNYu0qnOLSNdIkNNp9NdvU/XKF7tV5jCKj7bqgYHt1L5ZhNrEhqttXLhSTxTq5c93u8J/lDVICTGhSj1R6ArdlUVZg9QpPlJb0k5JKg/EfxnXRy2iat6v62yHsgs1cs43KilzasYdl2vcgPKR3ay8Er38+W4trjStd7YOzSL0yJAOur1PK9kdRp9sP6pFG9JdbakQGxGiK9s11dFTJUo5cuYXipCgAP382vZ6bmTdK2DPF2GmEsLMpcEYo2/2nVDb2PAa6yb+temwnv1om+v52SMVUvnuxfe8tbbKXcYHdojTb27poR6J0Vqz/4Sm/Hu7qzZIktrEhuvLSdcpLOTMh0r6ySLd+KeVKnU4Nf8nV6pT80h9sC5NH21Md33oSeU/vK7r0lxDujZX/7ZN1S4uQgEBFjmcRg+9t0Er9x5XqyZhWvrE4Hp9sPx1dap++8lOBVik935yla47PUKWX1KmH58eiWoZE6oTBTaVOYzaxIbrjdMr2sa9vVZ5JXZd1T5W7/3kSlf9yPF8m/677ahmJ+9Vgc2u1k3D9M6E/uqWUPP/JWOM3l2dqpmf764xuCTFhik4IEAFNrsKbfYqU0TBgRbdekUrWYMDtHBdmqKsQfrkqWvUNq5+dTAVcgpLteC7g/pgXZoCLOX/hld3iNPAjnFqExte60iIw1n+Q/ztVQf0/ZEzBeGJMaEa0CFO+SV25ZWUKa+4TMfySpRT6YM1NiJEt16RqCdv6FTjv1VuUZle/O8OfZKSUeM9yeqjxeli+3uuSqoS7mpTEVbjIkKqTMekZRdp2c5MfbkjUxsP5VQbuZHKA+ZzI7vq3qva1LoYoKYpx1ZNwrR44iDFVwoFTqfRs//apsWbjygowKLXx/fVyEpB2Bij8e+s05ofsjW8R7zefqC/pPIQ886qVM3/NtX1fdIuLlyPDe2oW69opUMnC7X9cK62Hz6llCN5OpRdWC3sXJHURPPG91ViLaNPBTa77nj9W+3LKlDfNk30j4evdvXtjqO5evaj7dqVUf690DQ8WFe1j9WA9nH6eu9x1y1ihveI16wf9apxVMoYo1X7ykftdmac+Z6KsgapS0KUusRHqU9SE/Vp00Qdm5eHpc9SMvTcR9tUWOpQiyir5o7rq/hoq3Zl5GtPZr72HstXVGiQ7h/Y1vWLjtNpNO6d8hWVZ/8SVSEjt1jZBaUqdThVZi+vc1ufmq0F3x1yFQo3j7Kq0GZ3/TIQGGDR9V2ba0iX5hrQIU6dW0S63vfA8QJ9sj1DS7cd
1f6sAv3mlh76aR03Pm4IwkwlhBlUeDV5r177ap8CAyx6Z0J/XX9WDYlU/lvMQws2yFbm1HMju2p4j6orswptdv3hyz16b81BSdL7Px2gazo3q/Y+FYWrUdYgFZTaXR8Y8dFW3da7la7v2kL92zWtdUlqblGZbn/jW6WeKNRV7WP1/k8HKCQowHVu5b7j+iGrQMcLbDqRb9PxApu2pZ+S00i/Ht1dP7u2Q5X3y8wt0dg3vnXVidx0WYJm/biXa2h4W/op3ffOOuXb7BrUMU63926lpduOas0PJ1w1BgPax2reff3qNf2y8eBJPfev7Tqeb9OgjnEa0rW5ruvcvNpv3Q6n0fJdx/T2Nwe08awbojZkms1ddhzN1UcbD+vjLUdqXRESHRqkm3om6JZeiRrUMa7eUwJOp1Gpw1n+sJc/bHanbHaHSsrKd+g+VVSmk4Wlyiks1cmiUrVvFqG7+ie5vRYtt7hMmbklOp5v04kCm7LyS/TxlqOuD/BerWP00m09dcVZtRYnCmya+MFmVw3TE9d30mcpGTpwolDdW0brn49crajQYDmdRr/6OEWLNqQrMMCi18f10U09W1Zrx75j+br5tVWyO41eu6e3fjheqPmrU5V/enqkW0KUHr++k0Zd3rLOlZb5JWU6cqpYR3KKVeYwur5b83MGv4MnCnXr3NXKK7HrniuT9NvbeuqNr/dr7v/2y+40irIGye401eqgrEEB+s0tPTT+rOnUmjidRmt+yJbd6VTXhCglRNe9gnN/VoEefX9TlV+sanJt52Z6dEhHHThRqN8s+V5hwYH6ctJ1ahNX/8UHBTa7FlWavpOk9s0idPeVSRrbt9U5R4aMMa4RbXfXYhFmKiHMoIIxRh9vOaLEJmG6ukPcuV9Qhx1Hc1VU6tCVtRTqnSoq1XWzViivpPyH8bWdm2n8gLYa1r1FvT/09mcV6I7Xv1W+za47+7VW1/goLd91TBsP5dQ46iFJd/dP0ss/urzGH5T7juVrxme7dH23Frr/6rbVrtl0KEcPvLuuymiJVP7b7R29EzVuQFtXoKovp9PUexXFlrQcvbMqVZ9/n6GHBrfXr2+pffsAbykpK58aTDtZpJiwYEWHBSkmLFgxYcHqmhBVrxESf2N3OPX3tYc0e9le5dvsslikXq1iFBkapIiQ8lqndQeydTS3RJHWIM25u7eG9YhXWnaRxs77VicKSnVNp2Z698H+eumTnXp/bfkI2Wv39NGYK2oPp2evXJPKQ8wzw7toRI/z2+7hfH29J0s/eW+DjCkfXaqY0rvpsgS9dHtPxYQFK+VIrtalZmvdgZMKDgzQlJu6qku85/a3KrTZNXVxipZuO6qQoAB1iY9U1/hodU2I1PdH8vTJ9qM6+8fAC2N66CeDGzY6YrM79M3eE2oSHqz+bZt6tL/rizBTCWEGvrLx4EmtSz2p0Ze3rPeS4bP9b3f5ip6z/5d2jY9S37ZN1SLKqmZRVjWPtKpVkzD1bBXdqB9C61NP6rH3NykusnzaZMwViec9zdNYdofzvAsf4X5ZeSWa+fnuKkuoK+vQLEJvPdBfnVpEuo6lHM7V3W99p6JSh9rFhetgdpEsFmn2XVfojj6t6/x6BTa7bvzT1zqWZ1OX+Eg9M6yLRl6W4LUlxfO+/kGvfLFbUvmU0ou39dSYXi19/qF+qqhUkdagav8n0k8W6d3VqVq0IU0lZU71a9tUHz0y0C+XYNeGMFMJYQb+7q+rU/Vq8l71btNEN3ZroRu7x1ebrgE8Ze+xfB08UaiiUocKbHYVldoVEhigsf1a17iCZcWeLP1swUbX6OGsO3vprv712+foyKliHTpRqKs7xHn9Q9kYo1eX71N2gU2ThnVR8yjPLjt2l5OFpfpm73Fd37XFRbfzOGGmEsIMAHjXki1HNGf5Xk0c2sntGzbi0nE+n9/c7AQA4Fa392nVqI3ogPPFxDQAAPBrhBkAAODXCDMAAMCvEWYAAIBfI8wAAAC/RpgBAAB+
jTADAAD8GmEGAAD4NcIMAADwa4QZAADg1wgzAADAr/lFmHnjjTfUvn17hYaGql+/flq1apWvmwQAAC4QF3yY+fDDDzVp0iRNmzZNW7Zs0bXXXqubb75ZaWlpvm4aAAC4AFiMMcbXjajLgAED1LdvX82bN891rHv37rr99ts1c+bMatfbbDbZbDbX87y8PCUlJdXrFuIAAODCkJeXp5iYmHp9fgd5qU0NUlpaqk2bNumXv/xlleMjRozQmjVranzNzJkz9eKLL1Y7npeX55E2AgAA96v43K7PmMsFHWZOnDghh8Oh+Pj4Ksfj4+OVmZlZ42umTp2qyZMnu54fOXJEPXr0UFJSkkfbCgAA3C8/P18xMTF1XnNBh5kKFoulynNjTLVjFaxWq6xWq+t5ZGSk0tPTFRUVVetrGqpiCis9PZ0pLA+jr72HvvYe+tp76GvvcVdfG2OUn5+vxMTEc157QYeZZs2aKTAwsNooTFZWVrXRmtoEBASodevWnmieS3R0NP85vIS+9h762nvoa++hr73HHX19rhGZChf0aqaQkBD169dPycnJVY4nJydr0KBBPmoVAAC4kFzQIzOSNHnyZN1///3q37+/Bg4cqLfeektpaWl69NFHfd00AABwAbjgw8zdd9+t7Oxs/fa3v1VGRoZ69uypzz77TG3btvV102S1WvXCCy9UqdGBZ9DX3kNfew997T30tff4oq8v+H1mAAAA6nJB18wAAACcC2EGAAD4NcIMAADwa4QZAADg1wgzDfTGG2+offv2Cg0NVb9+/bRq1SpfN8nvzZw5U1deeaWioqLUokUL3X777dqzZ0+Va4wxmj59uhITExUWFqahQ4dqx44dPmrxxWPmzJmyWCyaNGmS6xh97T5HjhzRfffdp7i4OIWHh6t3797atGmT6zx97R52u12//vWv1b59e4WFhalDhw767W9/K6fT6bqGvm6Yb775RmPGjFFiYqIsFouWLFlS5Xx9+tVms+nJJ59Us2bNFBERoVtvvVWHDx92TwMNztuiRYtMcHCwefvtt83OnTvN008/bSIiIsyhQ4d83TS/NnLkSDN//nzz/fffm61bt5rRo0ebNm3amIKCAtc1L7/8somKijL//ve/TUpKirn77rtNy5YtTV5eng9b7t/Wr19v2rVrZ3r16mWefvpp13H62j1Onjxp2rZtax588EGzbt06k5qaapYvX27279/vuoa+do/f/e53Ji4uznzyyScmNTXVfPTRRyYyMtLMmTPHdQ193TCfffaZmTZtmvn3v/9tJJmPP/64yvn69Oujjz5qWrVqZZKTk83mzZvN9ddfb6644gpjt9sb3T7CTANcddVV5tFHH61yrFu3buaXv/ylj1p0ccrKyjKSzMqVK40xxjidTpOQkGBefvll1zUlJSUmJibGvPnmm75qpl/Lz883nTt3NsnJyWbIkCGuMENfu8/zzz9vrrnmmlrP09fuM3r0aPPQQw9VOTZ27Fhz3333GWPoa3c5O8zUp19PnTplgoODzaJFi1zXHDlyxAQEBJgvvvii0W1imuk8lZaWatOmTRoxYkSV4yNGjNCaNWt81KqLU25uriQpNjZWkpSamqrMzMwqfW+1WjVkyBD6voEef/xxjR49WsOGDatynL52n6VLl6p///768Y9/rBYtWqhPnz56++23Xefpa/e55ppr9NVXX2nv3r2SpG3btmn16tUaNWqUJPraU+rTr5s2bVJZWVmVaxITE9WzZ0+39P0FvwPwhebEiRNyOBzVbnQZHx9f7YaYaDhjjCZPnqxrrrlGPXv2lCRX/9bU94cOHfJ6G/3dokWLtHnzZm3YsKHaOfrafQ4cOKB58+Zp8uTJ+tWvfqX169frqaeektVq1QMPPEBfu9Hzzz+v3NxcdevWTYGBgXI4HPr973+ve++9VxLf155Sn37NzMxUSEiImjZtWu0ad3x2EmYayGKxVHlujKl2DA33xBNPaPv27Vq9enW1c/R946Wn
p+vpp5/WsmXLFBoaWut19HXjOZ1O9e/fXzNmzJAk9enTRzt27NC8efP0wAMPuK6jrxvvww8/1Pvvv6+FCxfqsssu09atWzVp0iQlJiZqwoQJruvoa89oSL+6q++ZZjpPzZo1U2BgYLUkmZWVVS2VomGefPJJLV26VCtWrFDr1q1dxxMSEiSJvneDTZs2KSsrS/369VNQUJCCgoK0cuVK/fnPf1ZQUJCrP+nrxmvZsqV69OhR5Vj37t2VlpYmie9rd3ruuef0y1/+Uvfcc48uv/xy3X///XrmmWc0c+ZMSfS1p9SnXxMSElRaWqqcnJxar2kMwsx5CgkJUb9+/ZScnFzleHJysgYNGuSjVl0cjDF64okntHjxYv3vf/9T+/btq5xv3769EhISqvR9aWmpVq5cSd+fpxtvvFEpKSnaunWr69G/f3+NHz9eW7duVYcOHehrNxk8eHC1LQb27t3rulku39fuU1RUpICAqh9rgYGBrqXZ9LVn1Kdf+/Xrp+Dg4CrXZGRk6Pvvv3dP3ze6hPgSVLE0+9133zU7d+40kyZNMhEREebgwYO+bppfe+yxx0xMTIz5+uuvTUZGhutRVFTkuubll182MTExZvHixSYlJcXce++9LKt0k8qrmYyhr91l/fr1JigoyPz+9783+/btMx988IEJDw8377//vusa+to9JkyYYFq1auVamr148WLTrFkzM2XKFNc19HXD5Ofnmy1btpgtW7YYSWb27Nlmy5Ytri1J6tOvjz76qGndurVZvny52bx5s7nhhhtYmu1rr7/+umnbtq0JCQkxffv2dS0fRsNJqvExf/581zVOp9O88MILJiEhwVitVnPdddeZlJQU3zX6InJ2mKGv3ee///2v6dmzp7FaraZbt27mrbfeqnKevnaPvLw88/TTT5s2bdqY0NBQ06FDBzNt2jRjs9lc19DXDbNixYoafz5PmDDBGFO/fi0uLjZPPPGEiY2NNWFhYeaWW24xaWlpbmmfxRhjGj++AwAA4BvUzAAAAL9GmAEAAH6NMAMAAPwaYQYAAPg1wgwAAPBrhBkAAODXCDMAAMCvEWYAAIBfI8wAcJt27dppzpw59b7+66+/lsVi0alTpzzWpgvJ+fYPgPoJ8nUDAPjO0KFD1bt3b7d9wG7YsEERERH1vn7QoEHKyMhQTEyMW74+gEsTYQZAnYwxcjgcCgo694+L5s2bn9d7h4SEKCEhoaFNAwBJTDMBl6wHH3xQK1eu1GuvvSaLxSKLxaKDBw+6pn6+/PJL9e/fX1arVatWrdIPP/yg2267TfHx8YqMjNSVV16p5cuXV3nPs6dRLBaL3nnnHd1xxx0KDw9X586dtXTpUtf5s6eZ3nvvPTVp0kRffvmlunfvrsjISN10003KyMhwvcZut+upp55SkyZNFBcXp+eff14TJkzQ7bffXuffd82aNbruuusUFhampKQkPfXUUyosLKzS9pdeeknjxo1TZGSkEhMT9Ze//KXKe6Slpem2225TZGSkoqOjddddd+nYsWNVrlm6dKn69++v0NBQNWvWTGPHjq1yvqioSA899JCioqLUpk0bvfXWW3W2G8C5EWaAS9Rrr72mgQMH6uc//7kyMjKUkZGhpKQk1/kpU6Zo5syZ2rVrl3r16qWCggKNGjVKy5cv15YtWzRy5EiNGTNGaWlpdX6dF198UXfddZe2b9+uUaNGafz48Tp58mSt1xcVFemPf/yj/v73v+ubb75RWlqann32Wdf5V155RR988IHmz5+vb7/9Vnl5eVqyZEmdbUhJSdHIkSM1duxYbd++XR9++KFWr16tJ554osp1f/jDH9SrVy9t3rxZU6dO1TPPPKPk5GRJ5SNUt99+u06ePKmVK1cqOTlZP/zwg+6++27X6z/99FONHTtWo0eP1pYtW/TVV1+pf//+Vb7Gn/70J/Xv319btmzRxIkT9dhjj2n37t11th/AObjl3tsA/NKQIUPM008/XeXYihUrjCSz
ZMmSc76+R48e5i9/+Yvredu2bc2rr77qei7J/PrXv3Y9LygoMBaLxXz++edVvlZOTo4xxpj58+cbSWb//v2u17z++usmPj7e9Tw+Pt784Q9/cD232+2mTZs25rbbbqu1nffff795+OGHqxxbtWqVCQgIMMXFxa6233TTTVWuufvuu83NN99sjDFm2bJlJjAw0KSlpbnO79ixw0gy69evN8YYM3DgQDN+/Pha29G2bVtz3333uZ47nU7TokULM2/evFpfA+DcGJkBUKOzRxQKCws1ZcoU9ejRQ02aNFFkZKR27959zpGZXr16uf4cERGhqKgoZWVl1Xp9eHi4Onbs6HresmVL1/W5ubk6duyYrrrqKtf5wMBA9evXr842bNq0Se+9954iIyNdj5EjR8rpdCo1NdV13cCBA6u8buDAgdq1a5ckadeuXUpKSqoyelXRFxXXbN26VTfeeGOdbancHxaLRQkJCXX2B4BzowAYQI3OXpX03HPP6csvv9Qf//hHderUSWFhYbrzzjtVWlpa5/sEBwdXeW6xWOR0Os/remNMtWOVnX3+bE6nU4888oieeuqpaufatGlT52srvpYxptrXPft4WFhYne8lnX9/ADg3RmaAS1hISIgcDke9rl21apUefPBB3XHHHbr88suVkJCggwcPeraBZ4mJiVF8fLzWr1/vOuZwOLRly5Y6X9e3b1/t2LFDnTp1qvYICQlxXbd27doqr1u7dq26desmqXwUJi0tTenp6a7zO3fuVG5urrp37y6pfNTlq6++avTfE8D5YWQGuIS1a9dO69at08GDBxUZGanY2Nhar+3UqZMWL16sMWPGyGKx6De/+Y1PRhSefPJJzZw5U506dVK3bt30l7/8RTk5OTWOmlR4/vnndfXVV+vxxx/Xz3/+c0VERGjXrl1KTk6usmLp22+/1axZs3T77bcrOTlZH330kT799FNJ0rBhw9SrVy+NHz9ec+bMkd1u18SJEzVkyBDXlNwLL7ygG2+8UR07dtQ999wju92uzz//XFOmTPFspwCXOEZmgEvYs88+q8DAQPXo0UPNmzevs/7l1VdfVdOmTTVo0CCNGTNGI0eOVN++fb3Y2nLPP/+87r33Xj3wwAMaOHCgq/4lNDS01tf06tVLK1eu1L59+3TttdeqT58++s1vfqOWLVtWue4Xv/iFNm3apD59+uill17Sn/70J40cOVJS+XTQkiVL1LRpU1133XUaNmyYOnTooA8//ND1+qFDh+qjjz7S0qVL1bt3b91www1at26dZzoCgIvFnGuyGQAuYE6nU927d9ddd92ll156qcHv065dO02aNEmTJk1yX+MAeAXTTAD8yqFDh7Rs2TINGTJENptNc+fOVWpqqsaNG+frpgHwEaaZAPiVgIAAvffee7ryyis1ePBgpaSkaPny5a4iXACXHqaZAACAX2NkBgAA+DXCDAAA8GuEGQAA4NcIMwAAwK8RZgAAgF8jzAAAAL9GmAEAAH6NMAMAAPza/wduLXnPExB81gAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Prepare the word-frequency distribution used for negative sampling in\n",
    "# contrastive learning, then instantiate SkipGramNCE with it.\n",
    "embed_size = 128\n",
    "vocab_size = len(dataset.token2id)\n",
    "distribution = dataset.get_word_distribution()\n",
    "print(distribution)\n",
    "model = SkipGramNCE(vocab_size, embed_size, distribution)\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.optim import SGD, Adam\n",
    "\n",
    "class DataCollator:\n",
    "    \"\"\"Collate function holder that turns a list of samples into tensors.\"\"\"\n",
    "\n",
    "    @classmethod\n",
    "    def collate_batch(cls, batch):\n",
    "        \"\"\"Stack a batch and split its first two columns into long tensors.\n",
    "\n",
    "        Column 0 becomes ``input_ids`` and column 1 becomes ``labels``.\n",
    "        \"\"\"\n",
    "        pairs = np.array(batch)\n",
    "        first_col, second_col = pairs[:, 0], pairs[:, 1]\n",
    "        return {\n",
    "            'input_ids': torch.tensor(first_col, dtype=torch.long),\n",
    "            'labels': torch.tensor(second_col, dtype=torch.long),\n",
    "        }\n",
    "\n",
    "# Training hyperparameters.\n",
    "epochs = 100\n",
    "batch_size = 128\n",
    "learning_rate = 1e-3\n",
    "# Mean mini-batch loss of every epoch, filled in by the training loop.\n",
    "epoch_loss = []\n",
    "\n",
    "data_collator = DataCollator()\n",
    "dataloader = DataLoader(data, batch_size=batch_size, shuffle=True,\n",
    "                        collate_fn=data_collator.collate_batch)\n",
    "optimizer = Adam(model.parameters(), lr=learning_rate)\n",
    "model.zero_grad()\n",
    "model.train()\n",
    "\n",
    "# tqdm must be installed beforehand.\n",
    "from tqdm import trange\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Training loop: feed each batch to the model, which returns the loss,\n",
    "# back-propagate it and let the optimizer update the parameters.\n",
    "with trange(epochs, desc='epoch', ncols=60) as pbar:\n",
    "    for epoch in pbar:\n",
    "        running_loss, num_batches = 0.0, 0\n",
    "        for batch in dataloader:\n",
    "            loss = model(**batch)\n",
    "            batch_loss = loss.item()\n",
    "            pbar.set_description(f'epoch-{epoch}, loss={batch_loss:.4f}')\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            model.zero_grad()\n",
    "            running_loss += batch_loss\n",
    "            num_batches += 1\n",
    "        # Record the mean over all batches of the epoch rather than only the\n",
    "        # last batch, so the curve is not dominated by single-batch noise.\n",
    "        # Skipped when the loader produced no batch at all.\n",
    "        if num_batches:\n",
    "            epoch_loss.append(running_loss / num_batches)\n",
    "\n",
    "epoch_loss = np.array(epoch_loss)\n",
    "plt.plot(range(len(epoch_loss)), epoch_loss)\n",
    "plt.xlabel('training epoch')\n",
    "plt.ylabel('loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9430e9a",
   "metadata": {},
   "source": [
    "## TF-IDF加权\n",
    "\n",
    "定义词频率（term frequency）。注意到不同长度的文章词频率会有较大差距，不利于比较和运算，因此可以对词频率取对数。\n",
    "\n",
    "$$\\text{tf}_{t,d} = \\log (\\text{count}(t,d) + 1)$$\n",
    "\n",
    "其中$\\text{count}(t,d)$表示词$t$在文档$d$中出现的次数，为了避免对0取对数，把所有的计数加1。\n",
    "\n",
    "那么如何区分高频词与低频词呢？TF-IDF引入了另一个重要的评价指标——文档频率（document frequency），即一个词在语料库所包含的多少篇文档中出现。在所有文档里出现的词往往是虚词或是常见实词，而只在少量文档里出现的词往往是具有明确含义的实词并且具有很强的文档区分度。用$\\text{df}_t$来表示在多少篇文档中出现了词$t$。\n",
    "\n",
    "为了压低高频词和提升低频词的影响，TF-IDF使用文档频率的倒数，也就是逆向文档频率（inverse document frequency）来对词频率进行加权。这很好理解，一个词的文档频率越高，其倒数就越小，权重就越小。\n",
    "\n",
    "$$\\text{idf}_t = \\log \\frac{N}{\\text{df}_t}$$\n",
    "\n",
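    "其中$N$表示文档总数。为了避免分母为0，通常会将分母改为$\\text{df}_t+1$。下文的代码实现在开启平滑（`smooth_idf=True`）时采用 $\\text{idf}_t = \\log \\frac{N+1}{\\text{df}_t+1} + 1$，即分子分母同时加1，并在对数之外再加1，使得在所有文档中都出现的词也保留非零权重。\n",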
    "\n",
    "基于词频率和逆向文档频率，得到TF-IDF的最终值为：\n",
    "\n",
    "$$w_{t,d} = \\text{tf}_{t,d} \\times \\text{idf}_{t}$$\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f765e353",
   "metadata": {},
   "source": [
    "很多情况下会额外对文档的TF-IDF向量使用L2归一化，使得不同文档的TF-IDF向量具有相同的模长，便于相互比较。\n",
    "下面给出了TF-IDF的代码实现。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9ce8e610",
   "metadata": {},
   "outputs": [],
   "source": [
    "class TFIDF:\n",
    "    \"\"\"TF-IDF vectorizer for documents given as sequences of integer token ids.\n",
    "\n",
    "    Args:\n",
    "        vocab_size: Number of token ids; sets the length of ``idf`` and the\n",
    "            number of columns produced by ``transform``.\n",
    "        norm: Row normalization applied in ``transform``. ``'l1'`` scales each\n",
    "            row to unit L1 norm, any other truthy value (default ``'l2'``)\n",
    "            scales it to unit L2 norm, and a falsy value such as ``None``\n",
    "            disables normalization.\n",
    "        smooth_idf: If true, add one to every document frequency and to the\n",
    "            document count before computing the IDF.\n",
    "        sublinear_tf: If true, use ``log(count + 1)`` instead of raw counts.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, vocab_size, norm='l2', smooth_idf=True,\n",
    "                 sublinear_tf=True):\n",
    "        self.vocab_size = vocab_size\n",
    "        self.norm = norm\n",
    "        self.smooth_idf = smooth_idf\n",
    "        self.sublinear_tf = sublinear_tf\n",
    "\n",
    "    def fit(self, X):\n",
    "        \"\"\"Compute ``self.idf`` from the corpus ``X``.\n",
    "\n",
    "        Args:\n",
    "            X: Sized collection of documents, each an iterable of token ids\n",
    "                in ``range(vocab_size)``.\n",
    "        \"\"\"\n",
    "        doc_freq = np.zeros(self.vocab_size, dtype=np.float64)\n",
    "        for data in X:\n",
    "            # set() so that a token counts once per document.\n",
    "            for token_id in set(data):\n",
    "                doc_freq[token_id] += 1\n",
    "        doc_freq += int(self.smooth_idf)\n",
    "        n_samples = len(X) + int(self.smooth_idf)\n",
    "        # Without smoothing, token ids that never occur in X have a document\n",
    "        # frequency of 0, which would give an infinite idf and NaN rows in\n",
    "        # transform (0 * inf). Clamping to 1 is a no-op when smoothing is on.\n",
    "        doc_freq = np.maximum(doc_freq, 1)\n",
    "        self.idf = np.log(n_samples / doc_freq) + 1\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Return the TF-IDF matrix of ``X`` using the fitted ``idf``.\n",
    "\n",
    "        Args:\n",
    "            X: Sized collection of documents, each an iterable of token ids.\n",
    "\n",
    "        Returns:\n",
    "            Float64 array of shape ``(len(X), vocab_size)``. Rows that are\n",
    "            entirely zero are left as zeros by the normalization.\n",
    "\n",
    "        Raises:\n",
    "            AssertionError: If ``fit`` has not been called yet.\n",
    "        \"\"\"\n",
    "        assert hasattr(self, 'idf'), 'fit() must be called before transform()'\n",
    "        term_freq = np.zeros((len(X), self.vocab_size), dtype=np.float64)\n",
    "        for i, data in enumerate(X):\n",
    "            for token in data:\n",
    "                term_freq[i, token] += 1\n",
    "        if self.sublinear_tf:\n",
    "            term_freq = np.log(term_freq + 1)\n",
    "        Y = term_freq * self.idf\n",
    "        if self.norm:\n",
    "            if self.norm == 'l1':\n",
    "                row_norm = np.abs(Y).sum(axis=1)\n",
    "            else:\n",
    "                row_norm = np.sqrt((Y**2).sum(axis=1))\n",
    "            # Avoid 0 / 0 for empty documents.\n",
    "            row_norm[row_norm == 0] = 1\n",
    "            Y /= row_norm[:, None]\n",
    "        return Y\n",
    "\n",
    "    def fit_transform(self, X):\n",
    "        \"\"\"Fit the IDF on ``X`` and return its TF-IDF matrix.\"\"\"\n",
    "        self.fit(X)\n",
    "        return self.transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79d68b45-b6da-4859-852a-3f3a30baaae0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
